# TABLE 3 CODE

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import sys
sys.path.insert(1, '/REDACTED/fairness/code/utilities')
import UncertaintySimulation as unc

def regEstimate(dataset, pbvarb = 'predicted_prob_black', outcome = 'audited', wvarb = None):
    if wvarb is not None:
        model =  smf.wls(outcome + ' ~ ' + pbvarb, dataset, weights=dataset[wvarb]).fit(cov_type='HC1')
        coef= model.params[pbvarb]
        se = model.bse[pbvarb]
        black_rate = model.params[0] + model.params[1]
        non_black_rate = model.params[0]
        return coef, se, black_rate, non_black_rate
    else:
        model =  smf.ols(outcome + ' ~ ' + pbvarb, dataset).fit(cov_type='HC1')
        coef= model.params[pbvarb]
        se = model.bse[pbvarb]
        black_rate = model.params[0] + model.params[1]
        non_black_rate = model.params[0]
        return coef, se, black_rate, non_black_rate

## Probabilistic Estimator
def chenEstimate(dataset, pbvarb='pBlack', outcome='audited', wvarb=None):
    if wvarb is not None:
        black_rate = (dataset[pbvarb]*dataset[outcome]*dataset[wvarb]).sum()/(dataset[pbvarb]*dataset[wvarb]).sum()
        non_black_rate = ((1-dataset[pbvarb])*dataset[outcome]*dataset[wvarb]).sum()/((1-dataset[pbvarb])*dataset[wvarb]).sum()
        est = black_rate - non_black_rate
        return est, black_rate, non_black_rate
    else:
        black_rate = (dataset[pbvarb]*dataset[outcome]).sum()/(dataset[pbvarb]).sum()
        non_black_rate = ((1-dataset[pbvarb])*dataset[outcome]).sum()/(1-dataset[pbvarb]).sum()
        est = black_rate - non_black_rate
        return est, black_rate, non_black_rate


full_research_audits_df = pd.read_csv('/REDACTED/fairness/code/rf/data/clean_rf_data_plus_dep_database.csv')
#full_research_audits_df = pd.read_csv('/REDACTED/clean_rf_data_plus_dep_database_7_28.csv')
full_research_audits_df = full_research_audits_df[(full_research_audits_df.activity_code == 270) | (full_research_audits_df.activity_code == 271)]

full_research_audits_df.columns = full_research_audits_df.columns.str.lower()

sys.stdout = open("/REDACTED/error_stats_by_race.txt", "w")
full_research_audits_df['claim_dep_ind'] = np.where(full_research_audits_df.child_exemption_home_cd > 0, 1, 0*full_research_audits_df.child_exemption_home_cd)
partial_research_audits_df = full_research_audits_df[full_research_audits_df.claim_dep_ind.notnull()]

# first row
row_1_prob = chenEstimate(full_research_audits_df, 'predicted_prob_black', 'claim_dep_ind', 'base_weight')
print("Claudit_data Dependents:")
print("Prob")
print("Black: " + str(round(100*row_1_prob[1], 1)))
print("Non-Black: " + str(round(100*row_1_prob[2], 1)))
#chenEstimate(partial_research_audits_df, 'predicted_prob_black', 'claim_dep_ind', 'base_weight')
## ran a check if we remove missings-- diff result! 51 missing here

row_1_lin = regEstimate(full_research_audits_df, 'predicted_prob_black', 'claim_dep_ind', 'base_weight')
print("Lin")
print("Black: " + str(round(100*row_1_lin[2], 1)))
print("Non-Black: " + str(round(100*row_1_lin[3], 1)))
#regEstimate(partial_research_audits_df, 'predicted_prob_black', 'claim_dep_ind', 'base_weight')

# second row
row_2_prob = chenEstimate(full_research_audits_df, 'predicted_prob_black', 'dep_reduced_ind', 'base_weight')
print("\nDependent Error Rate:")
print("Prob")
print("Black: " + str(round(100*row_2_prob[1], 1)))
print("Non-Black: " + str(round(100*row_2_prob[2], 1)))
#chenEstimate(partial_research_audits_df, 'predicted_prob_black', 'dep_reduced_ind', 'base_weight')

row_2_lin = regEstimate(full_research_audits_df, 'predicted_prob_black', 'dep_reduced_ind', 'base_weight')
print("Lin")
print("Black: " + str(round(100*row_2_lin[2], 1)))
print("Non-Black: " + str(round(100*row_2_lin[3], 1)))
#regEstimate(partial_research_audits_df, 'predicted_prob_black', 'dep_reduced_ind', 'base_weight')

# 3rd row
#full_research_audits_df.changed_from_hoh.value_counts()
#full_research_audits_df.changed_from_hoh.isnull().sum()

row_6_prob = chenEstimate(full_research_audits_df, 'predicted_prob_black', 'changed_from_hoh', 'base_weight')
print("\nHead of Household Error:")
print("Prob")
print("Black: " + str(round(100*row_6_prob[1], 1)))
print("Non-Black: " + str(round(100*row_6_prob[2], 1)))

row_6_lin = regEstimate(full_research_audits_df, 'predicted_prob_black', 'changed_from_hoh', 'base_weight')
print("Lin")
print("Black: " + str(round(100*row_6_lin[2], 1)))
print("Non-Black: " + str(round(100*row_6_lin[3], 1)))


########### SCH C STUFF

sch_c = pd.read_csv('/REDACTED/data/raw/research_audits_sch_c_varbs.csv')
sch_c = sch_c.rename(columns={'taxpayer_id':'taxpayer_id_new', 'study_year_c':'study_year'})
full_research_audits_df = full_research_audits_df.merge(sch_c, on=['taxpayer_id_new', 'study_year'], how='left')
full_research_audits_df.columns = [x.lower() for x in full_research_audits_df.columns]

# 4th row
full_research_audits_df['pos_sch_c_income'] = np.where(full_research_audits_df.net_income_amt > 0, 1, 0)

row_4_prob = chenEstimate(full_research_audits_df, 'predicted_prob_black', 'pos_sch_c_income', 'base_weight')
print("\nAny Business Income:")
print("Prob")
print("Black: " + str(round(100*row_4_prob[1], 1)))
print("Non-Black: " + str(round(100*row_4_prob[2], 1)))

row_4_lin = regEstimate(full_research_audits_df, 'predicted_prob_black', 'pos_sch_c_income', 'base_weight')
print("Lin")
print("Black: " + str(round(100*row_4_lin[2], 1)))
print("Non-Black: " + str(round(100*row_4_lin[3], 1)))

# 5th row
# Construct outcome variable- sch c underreportaxpayer_idg
full_research_audits_df['net_income_adj_c'] = full_research_audits_df.net_income_amt_cor - full_research_audits_df.net_income_amt
full_research_audits_df['underreport'] = np.where(full_research_audits_df.net_income_adj_c > 0, 1, 0)

row_5_prob = chenEstimate(full_research_audits_df, 'predicted_prob_black', 'underreport', 'base_weight')
print("\nBusiness Income is Underreported:")
print("Prob")
print("Black: " + str(round(100*row_5_prob[1], 1)))
print("Non-Black: " + str(round(100*row_5_prob[2], 1)))

row_5_lin = regEstimate(full_research_audits_df, 'predicted_prob_black', 'underreport', 'base_weight')
print("Lin")
print("Black: " + str(round(100*row_5_lin[2], 1)))
print("Non-Black: " + str(round(100*row_5_lin[3], 1)))


########## 90th percentile, TAKE 3 HERE
# Testaxpayer_idg
x = np.array(full_research_audits_df[full_research_audits_df.net_income_adj_c > 0]['net_income_adj_c'])
wt = np.array(full_research_audits_df[full_research_audits_df.net_income_adj_c > 0]['base_weight'])

sorted_indices = np.argsort(x)
sorted_x = x[sorted_indices]
sorted_wt = wt[sorted_indices]

cumulative_weights = np.cumsum(sorted_wt)
total_weight = np.sum(sorted_wt)

target_percentile = 90
target_weight = total_weight * target_percentile / 100

index = np.searchsorted(cumulative_weights, target_weight, side='right')

if index > 0:
    if index < len(sorted_x): #shouldn't we always be in this case?
        alpha = (target_weight - cumulative_weights[index - 1]) / (cumulative_weights[index] - cumulative_weights[index - 1])
        #print(alpha)
        percentile_90 = sorted_x[index - 1] + alpha * (sorted_x[index] - sorted_x[index - 1])

full_research_audits_df['mistake'] = np.where(full_research_audits_df.net_income_adj_c > percentile_90, 1, 0)
#full_research_audits_df[full_research_audits_df.mistake == 1].base_weight.sum() / full_research_audits_df.base_weight.sum()

underreporters = full_research_audits_df[full_research_audits_df.net_income_adj_c > 0]
#underreporters[underreporters.mistake == 1].base_weight.sum() / underreporters.base_weight.sum()


row_7_prob = chenEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')
print("\nUnderreported Business Income Among Top 10%:")
print("Prob")
print("Black: " + str(round(100*row_7_prob[1], 2)))
print("Non-Black: " + str(round(100*row_7_prob[2], 2)))

row_7_lin = regEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')
print("Lin")
print("Black: " + str(round(100*row_7_lin[2], 2)))
print("Non-Black: " + str(round(100*row_7_lin[3], 2)))

########## 95th percentile
# Testaxpayer_idg
x = np.array(full_research_audits_df[full_research_audits_df.net_income_adj_c > 0]['net_income_adj_c'])
wt = np.array(full_research_audits_df[full_research_audits_df.net_income_adj_c > 0]['base_weight'])

sorted_indices = np.argsort(x)
sorted_x = x[sorted_indices]
sorted_wt = wt[sorted_indices]

cumulative_weights = np.cumsum(sorted_wt)
total_weight = np.sum(sorted_wt)

target_percentile = 95
target_weight = total_weight * target_percentile / 100

index = np.searchsorted(cumulative_weights, target_weight, side='right')

if index > 0:
    if index < len(sorted_x): #shouldn't we always be in this case?
        alpha = (target_weight - cumulative_weights[index - 1]) / (cumulative_weights[index] - cumulative_weights[index - 1])
        #print(alpha)
        percentile_95 = sorted_x[index - 1] + alpha * (sorted_x[index] - sorted_x[index - 1])

full_research_audits_df['mistake'] = np.where(full_research_audits_df.net_income_adj_c > percentile_95, 1, 0)
#full_research_audits_df[full_research_audits_df.mistake == 1].base_weight.sum() / full_research_audits_df.base_weight.sum()

underreporters = full_research_audits_df[full_research_audits_df.net_income_adj_c > 0]
#underreporters[underreporters.mistake == 1].base_weight.sum() / underreporters.base_weight.sum()

row_8_prob = chenEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')
print("\nUnderreported Business Income Among Top 5%:")
print("Prob")
print("Black: " + str(round(100*row_8_prob[1], 2)))
print("Non-Black: " + str(round(100*row_8_prob[2], 2)))

row_8_lin = regEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')
print("Lin")
print("Black: " + str(round(100*row_8_lin[2], 2)))
print("Non-Black: " + str(round(100*row_8_lin[3], 2)))

########## 99th percentile
# Testaxpayer_idg
x = np.array(full_research_audits_df[full_research_audits_df.net_income_adj_c > 0]['net_income_adj_c'])
wt = np.array(full_research_audits_df[full_research_audits_df.net_income_adj_c > 0]['base_weight'])

sorted_indices = np.argsort(x)
sorted_x = x[sorted_indices]
sorted_wt = wt[sorted_indices]

cumulative_weights = np.cumsum(sorted_wt)
total_weight = np.sum(sorted_wt)

target_percentile = 99
target_weight = total_weight * target_percentile / 100

index = np.searchsorted(cumulative_weights, target_weight, side='right')

if index > 0:
    if index < len(sorted_x): #shouldn't we always be in this case?
        alpha = (target_weight - cumulative_weights[index - 1]) / (cumulative_weights[index] - cumulative_weights[index - 1])
        #print(alpha)
        percentile_99 = sorted_x[index - 1] + alpha * (sorted_x[index] - sorted_x[index - 1])

full_research_audits_df['mistake'] = np.where(full_research_audits_df.net_income_adj_c > percentile_99, 1, 0)
#full_research_audits_df[full_research_audits_df.mistake == 1].base_weight.sum() / full_research_audits_df.base_weight.sum()

underreporters = full_research_audits_df[full_research_audits_df.net_income_adj_c > 0]
#underreporters[underreporters.mistake == 1].base_weight.sum() / underreporters.base_weight.sum()

row_9_prob = chenEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')
print("\nUnderreported Business Income Among Top 1%:")
print("Prob")
print("Black: " + str(round(100*row_9_prob[1], 2)))
print("Non-Black: " + str(round(100*row_9_prob[2], 2)))

row_9_lin = regEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')
print("Lin")
print("Black: " + str(round(100*row_9_lin[2], 2)))
print("Non-Black: " + str(round(100*row_9_lin[3], 2)))

sys.stdout = sys.__stdout__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
### OLD METHOD HERE, NOT USED IN PAPER

############# 90th percentile

full_research_audits_df['net_income_adj_c'] = full_research_audits_df['net_income_adj_c'].fillna(0)

# Testaxpayer_idg
x = np.array(full_research_audits_df['net_income_adj_c'])
wt = np.array(full_research_audits_df['base_weight'])

sorted_indices = np.argsort(x)
sorted_x = x[sorted_indices]
sorted_wt = wt[sorted_indices]

cumulative_weights = np.cumsum(sorted_wt)
total_weight = np.sum(sorted_wt)

target_percentile = 90
target_weight = total_weight * target_percentile / 100

index = np.searchsorted(cumulative_weights, target_weight, side='right')

if index > 0:
    if index < len(sorted_x): #shouldn't we always be in this case?
        alpha = (target_weight - cumulative_weights[index - 1]) / (cumulative_weights[index] - cumulative_weights[index - 1])
        #print(alpha)
        percentile_90 = sorted_x[index - 1] + alpha * (sorted_x[index] - sorted_x[index - 1])

full_research_audits_df['mistake'] = np.where(full_research_audits_df.net_income_adj_c > percentile_90, 1, 0)

chenEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')
regEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')

############# 95th percentile

# Testaxpayer_idg
x = np.array(full_research_audits_df['net_income_adj_c'])
wt = np.array(full_research_audits_df['base_weight'])

sorted_indices = np.argsort(x)
sorted_x = x[sorted_indices]
sorted_wt = wt[sorted_indices]

cumulative_weights = np.cumsum(sorted_wt)
total_weight = np.sum(sorted_wt)

target_percentile = 95
target_weight = total_weight * target_percentile / 100

index = np.searchsorted(cumulative_weights, target_weight, side='right')

if index > 0:
    if index < len(sorted_x): #shouldn't we always be in this case?
        alpha = (target_weight - cumulative_weights[index - 1]) / (cumulative_weights[index] - cumulative_weights[index - 1])
        print(alpha)
        percentile_95 = sorted_x[index - 1] + alpha * (sorted_x[index] - sorted_x[index - 1])

full_research_audits_df['mistake'] = np.where(full_research_audits_df.net_income_adj_c > percentile_95, 1, 0)

full_research_audits_df[full_research_audits_df.mistake == 1].base_weight.sum() / full_research_audits_df.base_weight.sum()

chenEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')
regEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')

############# 99th percentile

# Testaxpayer_idg
x = np.array(full_research_audits_df['net_income_adj_c'])
wt = np.array(full_research_audits_df['base_weight'])

sorted_indices = np.argsort(x)
sorted_x = x[sorted_indices]
sorted_wt = wt[sorted_indices]

cumulative_weights = np.cumsum(sorted_wt)
total_weight = np.sum(sorted_wt)

target_percentile = 99
target_weight = total_weight * target_percentile / 100

index = np.searchsorted(cumulative_weights, target_weight, side='right')

if index > 0:
    if index < len(sorted_x): #shouldn't we always be in this case?
        alpha = (target_weight - cumulative_weights[index - 1]) / (cumulative_weights[index] - cumulative_weights[index - 1])
        print(alpha)
        percentile_99 = sorted_x[index - 1] + alpha * (sorted_x[index] - sorted_x[index - 1])

full_research_audits_df['mistake'] = np.where(full_research_audits_df.net_income_adj_c > percentile_99, 1, 0)

full_research_audits_df[full_research_audits_df.mistake == 1].base_weight.sum() / full_research_audits_df.base_weight.sum()

chenEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')
regEstimate(full_research_audits_df, 'predicted_prob_black', 'mistake', 'base_weight')


######## Percentile cutoffs JUST among those with positive Sch C
full_research_audits_df['net_income_adj_c'] = full_research_audits_df.net_income_amt_cor - full_research_audits_df.net_income_amt